package com.kdtech.analyse.NewsPaper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * Page classifier and article parser for Hebei Economic Daily (hbjjrb.com),
 * e.g. http://paper.hbjjrb.com/html/2012-12/04/node_62.htm
 *
 * @author KK
 */
public class HbjjrbNewsPaperAnalyse implements AnalyseNews {

	/**
	 * URL patterns of article (detail) pages. Literal dots in host names and
	 * file extensions are escaped so that they no longer match arbitrary
	 * characters; the remaining {@code .*} sequences are intentional wildcards.
	 */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://paper\\.hbjjrb\\.com/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*\\.htm",
			"http://paper\\.hbjjrb\\.com/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*\\.htm\\?div=-1",
			"http://www\\.hbjjrb\\.com/.*/.*/[0-9]{6}/[0-9].*\\.html"
	};

	/** URL patterns of task pages (the dated {@code node_N.htm} pages of the e-paper). */
	private static final String[] TASK_PAGE_PATTERNS = {
			"http://paper\\.hbjjrb\\.com/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*\\.htm"
	};

	/** CSS selectors tried in order for the article title; the first non-blank text wins. */
	private static final String[] TITLE_SELECTORS = {
			"div#article1 table tbody tr td table tbody tr td table tbody tr td strong",
			"td.main_ArticleTitle",
			"h1"
	};

	/** CSS selectors tried in order for the article body; the first non-blank result wins. */
	private static final String[] CONTENT_SELECTORS = {
			"div#ozoom",
			"td#fontzoom",
			"div#NaiRong_left_Content"
	};

	/**
	 * Tells whether the URL is matched by one of the article page patterns.
	 *
	 * @param url the URL to classify
	 * @return the result of {@code RegexUtils.matchAny} for the detail page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, publication date, content and author from a fetched
	 * article page and packs them, together with the URL, into a {@link NewsMeta}.
	 *
	 * <p>NOTE(review): the HTML is handed to {@code Jsoup.parse} as is; a
	 * {@code null} HTML is not handled here - confirm that callers never pass one.
	 *
	 * @param urlMeta carrier of the page URL and its raw HTML
	 * @return a new {@link NewsMeta} filled with the extracted fields
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(html);

		// The extraction order (title, date, content, author) is kept as it
		// was, in case the content cleaning step touches the parsed document.
		String title = extractTitle(doc);

		// Prefer a date found in the "div#LaiYuan" text; fall back to the
		// date embedded in the URL.
		Long date = DateUtils.matchDate(doc.select("div#LaiYuan").text());
		if (date == null) {
			date = DateUtils.matchDate(url);
		}

		String content = extractContent(url, doc);
		// "来源：" is the label searched for on the page (it reads "Source:").
		String author = JSoupUtils.matchAuthor(doc, "来源：");

		NewsMeta newspaper = new NewsMeta();
		newspaper.setUrl(url);
		newspaper.setTitle(StringUtils.trimSpace(title));
		newspaper.setAuthor(author);
		newspaper.setContent(StringUtils.trimSpace(content));
		newspaper.setDate(date);
		return newspaper;
	}

	/**
	 * Tells whether the URL is matched by one of the task page patterns.
	 *
	 * @param url the URL to classify
	 * @return the result of {@code RegexUtils.matchAny} for the task page patterns
	 */
	public boolean isTaskPage(String url) {
		return RegexUtils.matchAny(url, TASK_PAGE_PATTERNS);
	}

	/** Returns the text of the first title selector that yields non-blank text, else the last text tried. */
	private static String extractTitle(Document doc) {
		String title = null;
		for (String selector : TITLE_SELECTORS) {
			title = doc.select(selector).text();
			if (!StringUtils.isBlank(title)) {
				break;
			}
		}
		return title;
	}

	/** Returns the cleaned HTML of the first content selector that yields a non-blank result, else the last result tried. */
	private static String extractContent(String url, Document doc) {
		String content = null;
		for (String selector : CONTENT_SELECTORS) {
			content = HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (!StringUtils.isBlank(content)) {
				break;
			}
		}
		return content;
	}

}
